Sample SQL to check for duplicate names
select names, count(*) as num_rows
from bloomington
group by names
having num_rows > 1
Sample SQL to remove duplicate names
select names, departments, max(salaries) as salaries
from bloomington
group by names, departments
Sample SQL to create column labeling part time or full time
select names
,departments
,salaries
,case when salaries < 25000 then 'Part Time'
else 'Full Time' end as emp_type
from bloomington
#filter the chicago data onto only full time employees
temp <- chicago %>% filter(emp_type == 'Full Time')
#fit a simple linear model with no intercept
glm_fit <- lm(salaries ~ 0 + departments, data=temp)
#get unique values of the departments
depts <- tibble(levels(chicago$departments))
colnames(depts) = c('departments')
#predict the mean and 95% prediction interval for each dept
preds <- data.frame(predict(glm_fit, newdata=depts, interval = "prediction"))
#combine back with unique list of depts
depts <- cbind(depts, preds)
#join back into main dataset
chic_final <- left_join(chicago, depts, by = c('departments')) %>%
filter(emp_type == 'Full Time')
select chi.*
,pred.lwr
,pred.upper
from chicago as chi
left join predictions as pred
on chi.departments = pred.departments